We will do PCA on a Major League Baseball dataset from fangraphs.com
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# Get working directory
import os
os.getcwd()
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(20,20)})
import matplotlib.pyplot as plt
import numpy as np
# Read in the csv file of MLB data from fangraphs.com that is stored in path:
pd.set_option('display.max_columns', None)
mlb_df = pd.read_csv('/Users/matthewberezo/Documents/FanGraphs_Leaderboard.csv')
mlb_df.head()
mlb_df.shape
# Load libraries for PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Standardize the feature matrix
features = StandardScaler().fit_transform(mlb_df[['HR', 'R', 'SB', 'ISO', 'BABIP', 'wRC+', 'wOBA', 'Off', 'Def']])
# Create a PCA that retains the first 5 principal components
pca = PCA(n_components=5)
# Conduct PCA
features_pca = pca.fit_transform(features)
# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_pca.shape[1])
# Rows are the principal components, columns are the original features
print(pd.DataFrame(pca.components_, columns=['HR', 'R', 'SB', 'ISO', 'BABIP', 'wRC+', 'wOBA', 'Off', 'Def']))
print(pca.explained_variance_ratio_)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
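Rather than hard-coding the number of components, n_components also accepts a float between 0 and 1, in which case scikit-learn keeps however many components are needed to reach that fraction of explained variance. A minimal sketch, reusing the standardized features matrix from above:
# Keep enough components to explain 99% of the variance
pca_99 = PCA(n_components=0.99)
features_pca_99 = pca_99.fit_transform(features)
print("Components needed for 99% of variance:", pca_99.n_components_)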
# Let's check the shape of our features_pca
features_pca.shape
features_pca_df = pd.DataFrame(features_pca)
features_pca_df.head()
mlb_df
features_pca_df
# Let's plot principal components with player names labeled:
features_pca_df = features_pca_df.add_prefix('pca')
mlb_df_pca = pd.concat([mlb_df, features_pca_df], axis = 1)
mlb_df_pca
plt.scatter(mlb_df_pca.pca0, mlb_df_pca.pca1, s=mlb_df.WAR.clip(lower=0)**3)  # clip so negative WAR values don't produce negative point sizes
# We can also do this in seaborn and label our points
plot_w_text = sns.regplot(data = mlb_df_pca, x = 'pca0', y = 'pca1', color = 'red', fit_reg = False)
for line in range(0, mlb_df_pca.shape[0]):
    plot_w_text.text(mlb_df_pca.pca0[line]+0.2, mlb_df_pca.pca1[line], mlb_df_pca.Name[line], horizontalalignment='left', size='medium', color='black', weight='semibold')
https://scikit-learn.org/stable/modules/lda_qda.html
Again, LDA can be used for supervised dimensionality reduction by projecting the input data to a linear subspace consisting of the directions which maximize the separation between classes.
We will again use our wine dataset for LDA:
wine_df = pd.read_csv('/Users/matthewberezo/Documents/wineQualityReds.csv')
wine_df = wine_df.drop(['Unnamed: 0'], axis=1)
wine_df.head()
wine_df.shape
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Create and run an LDA
lda = LinearDiscriminantAnalysis(n_components=2)
wine_df['quality'].unique()
x_lda = lda.fit(wine_df[wine_df.columns[0:11]], wine_df['quality']).transform(wine_df[wine_df.columns[0:11]])
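Note that LDA can produce at most min(n_features, n_classes - 1) discriminant components, so with eleven features and the six quality levels present in this dataset, the ceiling here is five; n_components=2 is a deliberate reduction. A quick check of that bound:
n_classes = wine_df['quality'].nunique()
n_features = wine_df[wine_df.columns[0:11]].shape[1]
print('Maximum LDA components:', min(n_features, n_classes - 1))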
# Print the number of features
print('Original number of features:', wine_df[wine_df.columns[0:11]].shape[1])
print('Reduced number of features:', x_lda.shape[1])
# Create array of explained variance ratios
lda_var_ratios = lda.explained_variance_ratio_
lda_var_ratios
lda_df = pd.DataFrame(x_lda)
lda_df.shape
lda_df = lda_df.add_prefix('lda')
lda_df.head()
wine_df_lda = pd.concat([wine_df, lda_df], axis = 1)
wine_df_lda.head()
lda_plot = sns.lmplot(data = wine_df_lda, x = 'lda0', y = 'lda1', hue = 'quality', fit_reg = False)
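As an aside, LinearDiscriminantAnalysis is also a classifier, so a quick in-sample accuracy check (optimistic, since there is no train/test split) can confirm the projection carries class information; a minimal sketch reusing the fitted lda object:
# In-sample accuracy of the fitted LDA classifier (no train/test split, so optimistic)
print('LDA training accuracy:', lda.score(wine_df[wine_df.columns[0:11]], wine_df['quality']))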
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
K-means clustering is an unsupervised approach for partitioning a dataset into K distinct, non-overlapping clusters (i.e., "grouping" the data)
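To see the partitioning idea in isolation before applying it to real data, here is a minimal sketch on synthetic blobs (make_blobs and the parameter values are illustrative only, not part of the MLB analysis):
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
toy_X, _ = make_blobs(n_samples=60, centers=3, random_state=1)  # 3 well-separated synthetic clusters
toy_labels = KMeans(n_clusters=3, random_state=1).fit_predict(toy_X)
print(toy_labels[:10])  # each point belongs to exactly one of the 3 clusters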
# We will reuse the MLB data for this exercise
mlb_df.head()
# Make k-means clusterer
from sklearn.cluster import KMeans
clusterer = KMeans(n_clusters=7, random_state=1)
# We first will want to rescale our variables
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit(mlb_df[['HR', 'R', 'RBI', 'SB', 'wRC+', 'BsR', 'Off', 'Def', 'WAR']])
data_transformed = mms.transform(mlb_df[['HR', 'R', 'RBI', 'SB', 'wRC+', 'BsR', 'Off', 'Def', 'WAR']])
# Fit clusterer
clusterer.fit(data_transformed)
# Predict values
mlb_df['clust_grp'] = clusterer.predict(data_transformed)
mlb_df['clust_grp'].unique()
# Keep only players with non-negative WAR before plotting
mlb_df = mlb_df[(mlb_df['WAR'] >= 0)]
sns.lmplot(data = mlb_df, x = 'Def', y = 'Off', hue = 'clust_grp', fit_reg = False)
# Let's recreate this scatterplot with plotly
import plotly.express as px
fig = px.scatter(mlb_df, x="Def", y="Off", color="clust_grp",
                 size='WAR',
                 hover_data=['Name'])
fig.show()
# We can also evaluate k-means clustering with SSE (the sum of squared distances, or "inertia")
# Use the elbow method to help choose k
from sklearn import metrics
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed)
    Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()
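The metrics module imported above also provides the silhouette score, an alternative to SSE that rewards tight, well-separated clusters (closer to 1 is better). A sketch over the same range of k (the silhouette score requires at least 2 clusters):
silhouette_scores = []
for k in range(2, 15):
    km = KMeans(n_clusters=k, random_state=1).fit(data_transformed)
    silhouette_scores.append(metrics.silhouette_score(data_transformed, km.labels_))
plt.plot(range(2, 15), silhouette_scores, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette Score For Optimal k')
plt.show()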
What is the difference between fit() and fit_transform()? See https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models
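In short: fit() learns parameters from the data (e.g., a scaler's means and standard deviations), transform() applies those learned parameters, and fit_transform() does both in one call; the fitted object can then transform new data with the same parameters. A minimal illustration with StandardScaler:
scaler = StandardScaler()
scaled = scaler.fit_transform(mlb_df[['HR', 'R']])   # learn mean/std AND apply them
scaled_again = scaler.transform(mlb_df[['HR', 'R']]) # reuse the already-learned mean/std
print(np.allclose(scaled, scaled_again))             # True: identical results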
TSNE documentation: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
# Let's use the wine dataset and the same variables to see if t-SNE gives more defined clusters
from sklearn.manifold import TSNE
x_embedded = TSNE(n_components=3,
                  perplexity=25,
                  learning_rate=1000
                  # ,n_iter=10000
                  # ,n_iter_without_progress=???
                  ).fit_transform(wine_df[wine_df.columns[0:11]])
pd.DataFrame(x_embedded).head()
x_embedded_df = pd.DataFrame(x_embedded)
x_embedded_df = x_embedded_df.add_prefix('tsne')
x_embedded_df.head()
wine_df_tsne = pd.concat([wine_df, x_embedded_df], axis = 1)
wine_df_tsne.head()
# Let's plot the first two t-SNE components with plotly
import plotly.express as px
fig = px.scatter(wine_df_tsne, x="tsne0", y="tsne1", color="quality")
fig.show()
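Since the embedding was computed with n_components = 3, a third axis is available as well; a 3-D view of the same embedding, sketched with plotly's scatter_3d:
fig = px.scatter_3d(wine_df_tsne, x="tsne0", y="tsne1", z="tsne2", color="quality")
fig.show()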
UMAP documentation: https://umap-learn.readthedocs.io/en/latest/
import umap
reducer = umap.UMAP(n_neighbors=5,
                    min_dist=0.75,
                    n_components=3,
                    metric='euclidean')
wine_df[wine_df.columns[0:11]].head()
wine_embedding = reducer.fit_transform(wine_df[wine_df.columns[0:11]])
wine_embedding.shape
pd.DataFrame(wine_embedding).head()
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(25,25)})
plt.scatter(wine_embedding[:,0], wine_embedding[:,1], c = wine_df["quality"])
# Let's perform UMAP with a target (supervised UMAP)
# Note: the hamming metric is intended for binary/categorical data;
# euclidean may be a better fit for these continuous features
embedding = umap.UMAP(n_neighbors=200,
                      min_dist=0.15,
                      metric='hamming'
                      # ,n_components=5
                      ).fit_transform(wine_df[wine_df.columns[0:11]], y=wine_df["quality"])
embedding.shape
umap_embedding_df = pd.DataFrame(embedding)
umap_embedding_df = umap_embedding_df.add_prefix('umap')
umap_embedding_df.head()
wine_umap_df = pd.concat([wine_df, umap_embedding_df], axis = 1)
fig = px.scatter(wine_umap_df, x="umap0", y="umap1", color="quality")
fig.show()